Loading data

This is a function that will load all packages required for the analysis. If you don't have one of these packages installed, `install.packages('dplyr')`, for example, will install it (Bioconductor packages such as scran or DESeq2 are installed with `BiocManager::install()`). You can then load a package using `library(dplyr)`.

# Load all packages required for the analysis in a single call.
# If a package is missing, install it first with install.packages() (or
# BiocManager::install() for Bioconductor packages), then re-run this.
# NOTE: duplicate library(Seurat) and library(edgeR) calls were removed.
load_packages <- function(){
  library(SingleCellExperiment)
  library(dplyr)
  library(scran)
  library(scater)
  library(Seurat)
  library(magrittr)
  library(BiocGenerics)
  library(AnnotationHub)
  library(EnsDb.Hsapiens.v86)
  library(DESeq2)
  library(edgeR)
  library(forcats)
  library(ggplot2)
  library(reshape2)
  library(ggrepel)
  library(batchelor)
  library(stringr)
  library(RColorBrewer)
  library(SC3)
  library(dendextend)
  library(dynamicTreeCut)
  library(limma)
  library(SeuratWrappers)
  library(Nebulosa)
  library(cluster, quietly = TRUE)  # quietly: suppress masking messages
  library(harmony)
  library(clustree)
  library(destiny)
  library(plotly)
  library(org.Hs.eg.db)
  library(ComplexHeatmap)
  library(circlize)
  library(VennDiagram)
}
load_packages()

This is a function to retrieve human genome annotations e.g. descriptions and type e.g. rRNA, non-coding RNA etc for all human genes

#### GET GENOME ANNOTATIONS ####
# Retrieve gene-level annotations for all human genes (Ensembl id, name,
# biotype, description, symbol) from AnnotationHub record AH83216 (EnsDb).
# Requires network access on the first run; results are cached afterwards.
load_annotations <- function(){
  hub <- AnnotationHub::AnnotationHub()
  ensdb <- hub[["AH83216"]]
  gene_df <- ensembldb::genes(ensdb, return.type="data.frame")
  dplyr::select(gene_df, gene_id, gene_name, gene_biotype, description, symbol)
}
annotations <- load_annotations()
## snapshotDate(): 2020-04-27
## loading from cache

Load and format data

Firstly, load the count data (genes x samples matrix). I also set the root directory to make it easier to save images/tables etc., and set a theme for graphs and a colour palette used for all the graphs.

##### LOAD DATA #####
# Read the UMI count matrix (genes x samples) exported by the primary pipeline.
# NOTE(review): hard-coded absolute HPC paths — consider a config variable so
# the script is portable.
X <- read.csv('/hpc/scratch/hdd2/fs541623/scRNAseq/Run1_3_Cisplatin_080721/Secondary_analysis/Copy of UMI_matrix_150821.csv')
# Root directory used to build all output paths for figures/tables below.
root <- '/hpc/scratch/hdd2/fs541623/scRNAseq/Run1_3_Cisplatin_080721/Secondary_analysis'
# Global ggplot theme plus a 17-colour palette (Set2 + Set3) shared by all plots.
theme_set(theme_bw())
cc <- c(brewer.pal(8, "Set2"),brewer.pal(9,"Set3"))
head(X[,1:4])
##              gene.id       Gene Control_day1 Control_day1.1
## 1  ENSG00000268903.1 AL627309.6            0              0
## 2 ENSG00000228463.10 AP006222.1            0              0
## 3  ENSG00000225972.1   MTND1P23            0              1
## 4  ENSG00000225630.1   MTND2P28            0              2
## 5  ENSG00000237973.1   MTCO1P12            0              0
## 6  ENSG00000229344.1   MTCO2P12            0              0

Need to format data. Make sure the rownames are the Ensembl gene ids. Create a single cell experiment object as it helps accessing the data more easily. A summary about single cell experiment (SCE) can be found at [orchestrating single cell analysis rmarkdown book]: (https://bioconductor.org/books/release/OSCA/). Save the SCE object. Assign metadata to cells e.g. time, day etc and store in SCE

######### FORMAT DATA ##########
# Drop the annotation columns so Y holds only the numeric counts, keyed by
# Ensembl gene id.
Y <- X %>% dplyr::select(-c(Gene, gene.id))
rownames(Y) <- X$gene.id
# Build a SingleCellExperiment with gene symbols in rowData; use
# uniquifyFeatureNames() so duplicated symbols get unique row names.
sce_3 <- SingleCellExperiment(list(counts=Y), rowData=DataFrame(symbol=X$Gene)) %>% 
  set_rownames(uniquifyFeatureNames(rownames(.), rowData(.)$symbol))
#saveRDS(sce_3, paste0(root, '/new_SCE_objects/FINAL_WORKFLOW/raw_sce.rds'))

## get sample labels ##
# Timepoint is parsed from the column name (e.g. 'X24h.3' -> '24h'); columns
# without a '<digits>h' token are assumed to be untreated controls -> '0h'.
sce_3$sample <- str_extract(colnames(sce_3), '\\d+h')
sce_3$sample[is.na(sce_3$sample)] <- '0h'
sce_3$sample <- factor(sce_3$sample, levels=c('0h', '2h', '8h','16h','24h', '48h','72h'))

## get day labels ##
# Day of sampling comes from names like 'Control_day1'; all other columns
# (treated samples) are labelled 'Other'.
days <- str_extract(colnames(Y), '.*_day\\d+')
days[is.na(days)] <- 'Other'
sce_3$day <- days
sce_3
## class: SingleCellExperiment 
## dim: 18286 219 
## metadata(0):
## assays(1): counts
## rownames(18286): AL627309.6 AP006222.1 ... RNU6-184P CD24P4
## rowData names(1): symbol
## colnames(219): Control_day1 Control_day1.1 ... X72h.30 X72h.31
## colData names(2): sample day
## reducedDimNames(0):
## altExpNames(0):

Quality control

QC removes failed/low-quality libraries, as these confound downstream analysis by introducing technical noise: they can exaggerate fold changes during differential expression testing and drive clustering based on technical noise rather than relevant biological information. The proportion of mitochondrial (MT) counts is a good metric to store; a high MT fraction is usually indicative of low-quality libraries (e.g. apoptotic processes), and mtDNA enrichment can sometimes imply loss of other RNA from the cell. rRNA content is also worth checking to see whether rRNA depletion was successful during library preparation.

This function adds per cell QC metrics e.g. total UMI counts per cell, MT counts per cell, genes per cell, rRNA counts per cell. These are stored in the column data of the SCE object.

########### QC #############################
# addPerCellQC() needs a matrix-like assay, so coerce the data.frame counts.
counts(sce_3) <- data.matrix(counts(sce_3))
# Add per-cell QC metrics (total UMIs 'sum', genes detected, %top genes) plus
# mito and ribo gene subsets identified by symbol prefix.
# NOTE(review): '^RP' also matches clone-derived symbols such as 'RP11-...',
# not only ribosomal proteins ('^RP[SL]') — confirm this is intended.
sce_3 <- addPerCellQC(sce_3, 
  subsets=list(mito=grepl('^MT-', rowData(sce_3)$symbol), 
  ribo=grepl('^RP', rowData(sce_3)$symbol)))
head(colData(sce_3)[,1:11])
## DataFrame with 6 rows and 11 columns
##                  sample          day       sum  detected percent_top_50
##                <factor>  <character> <integer> <integer>      <numeric>
## Control_day1         0h Control_day1      3269      1004        25.8489
## Control_day1.1       0h Control_day1      3814      1130        26.9009
## Control_day1.2       0h Control_day1       366       180        58.7432
## Control_day2         0h Control_day2      1493       612        31.2793
## Control_day1.3       0h Control_day1      1689       646        29.7809
## Control_day1.4       0h Control_day1       100        48       100.0000
##                percent_top_100 percent_top_200 percent_top_500 subsets_mito_sum
##                      <numeric>       <numeric>       <numeric>        <integer>
## Control_day1           38.4215         56.1946         83.0223               82
## Control_day1.1         38.6733         54.3262         79.4966              293
## Control_day1.2         78.1421        100.0000        100.0000                3
## Control_day2           47.1534         66.2425         92.4983               56
## Control_day1.3         45.1747         65.3641         91.3558               64
## Control_day1.4        100.0000        100.0000        100.0000                2
##                subsets_mito_detected subsets_mito_percent
##                            <integer>            <numeric>
## Control_day1                      16             2.508412
## Control_day1.1                    17             7.682223
## Control_day1.2                     3             0.819672
## Control_day2                      11             3.750837
## Control_day1.3                    13             3.789224
## Control_day1.4                     2             2.000000

You can then begin to assess the QC metrics through QC plots. To change the plot, supply a different column name for x or y; e.g. y='detected' instead of 'sum' would show genes detected. Log-scale the y-axis to identify the lower-tail outliers. We then discard cells based on a combination of metrics to avoid discarding relevant biological heterogeneity. (Some of these thresholds were based on a previous QC experiment.)

#### QC plots ####
# Library size per sample (log scale) before filtering.
p1 <- plotColData(sce_3, x="sample", y="sum") + scale_y_log10() 
#ggsave(paste0(root, '/new_visualisations/FINAL_WORKFLOW/QC_sum_logscale.png'))
# Discard cells with small libraries (<600 UMIs), very large libraries
# (>20000 UMIs, possible doublets), or few detected genes (<500).
# NOTE(review): 'subsets_mito_percent < 1' removes cells with *low* mito
# content — the opposite direction of the usual high-mito filter; confirm
# this threshold/direction was intended (prose above says high MT = low
# quality; a separate >20% filter is applied later in Seurat).
discard <- sce_3$sum < 600 | sce_3$subsets_mito_percent < 1 | sce_3$sum > 20000 | sce_3$detected < 500
sce_3$discard <- discard
filtered_sce_3 <-sce_3[,!(discard)]
# All retained cells already have detected >= 500, so 'lowgenes' is always
# FALSE here — it only serves to visually confirm the filter in p2.
filtered_sce_3$lowgenes <- filtered_sce_3$detected < 500
p2 <- plotColData(filtered_sce_3, x="sample", y="sum", colour_by = 'lowgenes') + scale_y_log10() 
#ggsave(paste0(root, '/new_visualisations/FINAL_WORKFLOW/QC_sum_logscale_afterfiltering.png'))
cowplot::plot_grid(p1 + ggtitle('Before QC'),p2 + ggtitle('After QC'))

Normalization

Normalization is necessary to correct for variability in sequencing depth caused by technical factors. It also helps give equal weight to low- and high-abundance genes so that high-abundance genes (which aren't usually the most interesting) do not dominate downstream analysis (variance stabilization). This paper explains it well: [normalization paper]: https://www.biorxiv.org/content/biorxiv/early/2021/07/09/2021.07.07.451498.full.pdf. In summary, there are 2 ways to normalize: use size factors to scale the gene expression values per cell, or use statistical distributions to model the data (e.g. UMI counts often follow a negative binomial distribution); the gene dispersions are then estimated from the model and technical variability 'removed'. The high % of zeros in scRNAseq makes normalization difficult (zero-inflated data). Another approach that I haven't been able to try, but which looks interesting, is the package [scone]:https://www.bioconductor.org/help/course-materials/2016/BioC2016/ConcurrentWorkshops1/Risso/scone.html, which compares different normalization methods and 'chooses' the most appropriate one.

In this chunk, we set the seed (SCTransform can be random, so setting the seed ensures you get the same results each time). Then we convert between an SCE and a Seurat object to do the normalization etc.; Seurat is also more intuitive than SCE. We add the MT percentage counts as 'mito.percent', define high mito counts as > 20%, and normalize the data using [SCTransform]:https://satijalab.org/seurat/articles/sctransform_vignette.html. This function also identifies the top highly variable genes (often the most important). Selecting the most informative genes helps reduce technical noise further, and these genes are used for principal component analysis (PCA), which is a dimensionality reduction technique. The 'SCT' assay contains the normalized counts and the 'RNA' assay contains the unnormalized counts.

############## NORMALIZATION ################### 
# SCTransform has a stochastic component, so fix the seed for reproducibility.
set.seed(123)
# Build a Seurat object from the QC-filtered counts, record per-cell
# mitochondrial percentage, flag high-mito cells (>20%), normalize with
# SCTransform (keeping the top 1000 highly variable genes), and run PCA on
# those variable features. 'SCT' assay = normalized, 'RNA' = raw counts.
s <- CreateSeuratObject(counts(filtered_sce_3)) %>%
  # BUG FIX: anchor the regex ('^MT-') so only genuine mitochondrial genes
  # are counted, consistent with the '^MT-' pattern used in addPerCellQC.
  PercentageFeatureSet(., pattern='^MT-', col.name='mito.percent') %>%
  AddMetaData(.$mito.percent > 20, col.name='high.mito') %>% 
  SCTransform(., verbose=F, return.only.var.genes = F, 
  variable.features.n = 1000)%>%
  RunPCA(assay='SCT', features=VariableFeatures(.))

Write the most informative genes (HVG) to a file with their associated gene annotations

### Write HVG to file ### 
# Rank the SCTransform highly variable genes by residual variance, keep the
# top 1000, and attach genome annotations before (optionally) writing out.
hvf <- HVFInfo(s)
hvf$gene_name <- rownames(hvf)  # capture rownames before arrange() drops them
top_hvg <- hvf %>% 
  arrange(desc(residual_variance)) %>% 
  top_n(1000, residual_variance)
hvg.anno <- annotations %>% 
  dplyr::filter(gene_name %in% top_hvg$gene_name) %>% 
  left_join(top_hvg, by='gene_name') %>% 
  arrange(desc(residual_variance))
#write.csv(hvg.anno, paste0(root, '/new_tables/FINAL_WORKFLOW/hvglist.csv'))
head(hvg.anno)
##           gene_id gene_name   gene_biotype
## 1 ENSG00000100526     CDKN3 protein_coding
## 2 ENSG00000210082   MT-RNR2        Mt_rRNA
## 3 ENSG00000114023   FAM162A protein_coding
## 4 ENSG00000154719    MRPL39 protein_coding
## 5 ENSG00000005448     WDR54 protein_coding
## 6 ENSG00000142546     NOSIP protein_coding
##                                                                        description
## 1           cyclin dependent kinase inhibitor 3 [Source:HGNC Symbol;Acc:HGNC:1791]
## 2              mitochondrially encoded 16S rRNA [Source:HGNC Symbol;Acc:HGNC:7471]
## 3 family with sequence similarity 162 member A [Source:HGNC Symbol;Acc:HGNC:17865]
## 4          mitochondrial ribosomal protein L39 [Source:HGNC Symbol;Acc:HGNC:14027]
## 5                          WD repeat domain 54 [Source:HGNC Symbol;Acc:HGNC:25770]
## 6    nitric oxide synthase interacting protein [Source:HGNC Symbol;Acc:HGNC:17946]
##    symbol     gmean   variance residual_variance
## 1   CDKN3 0.2115648   3.722939          5.595760
## 2 MT-RNR2 2.9117087 150.625112          5.433657
## 3 FAM162A 0.3615786   5.378336          5.041629
## 4  MRPL39 0.2122900   2.669565          4.996713
## 5   WDR54 0.1824233   4.007496          4.839889
## 6   NOSIP 0.1549822   1.288756          4.839074

Dimensionality reduction

Can also write PCA loadings to a file with their associated gene annotations. PCA loadings ([pca explained]:https://www.youtube.com/watch?v=FgakZw6K1QQ) are the ‘scores’ given by the PCA algorithm showing how much each gene contributes to the variance of a given principal component. Here we arrange PC1 and PC2 loadings from highest to lowest.

### Write PCA loadings to file ###
# PCA loadings score how much each gene contributes to each principal
# component. The two tables below differed only in the sort column, so the
# shared logic is factored into one helper (annotate loadings, then order
# descending by the requested PC).
loadings_ordered_by <- function(seurat_obj, anno, pc) {
  Loadings(seurat_obj) %>% 
    as.data.frame(.) %>% dplyr::select(PC_1, PC_2) %>% 
    tibble::rownames_to_column(var='gene_name') %>% 
    left_join(., anno, by='gene_name') %>% 
    arrange(desc(.data[[pc]]))
}
PCAloadings1 <- loadings_ordered_by(s, annotations, 'PC_1')
PCAloadings2 <- loadings_ordered_by(s, annotations, 'PC_2')
#write.csv(PCAloadings1, paste0(root, '/new_tables/FINAL_WORKFLOW/PC1_loading_ordered.csv'))
#write.csv(PCAloadings2, paste0(root, '/new_tables/FINAL_WORKFLOW/PC2_loading_ordered.csv'))
head(PCAloadings1)
##   gene_name      PC_1        PC_2         gene_id   gene_biotype
## 1      MT1X 0.2702030  0.11638755 ENSG00000187193 protein_coding
## 2       FTL 0.2077366  0.02398383 ENSG00000087086 protein_coding
## 3  HIST1H4C 0.1838079 -0.03790760            <NA>           <NA>
## 4     COX5A 0.1440578 -0.03729979 ENSG00000178741 protein_coding
## 5    CXCL14 0.1301379  0.08188350 ENSG00000145824 protein_coding
## 6  SERPINA1 0.1283917  0.09347299 ENSG00000197249 protein_coding
##                                                           description   symbol
## 1               metallothionein 1X [Source:HGNC Symbol;Acc:HGNC:7405]     MT1X
## 2             ferritin light chain [Source:HGNC Symbol;Acc:HGNC:3999]      FTL
## 3                                                                <NA>     <NA>
## 4  cytochrome c oxidase subunit 5A [Source:HGNC Symbol;Acc:HGNC:2267]    COX5A
## 5 C-X-C motif chemokine ligand 14 [Source:HGNC Symbol;Acc:HGNC:10640]   CXCL14
## 6         serpin family A member 1 [Source:HGNC Symbol;Acc:HGNC:8941] SERPINA1

Choosing the optimum number of principal components (PCs) to summarise the variance within the data is difficult. Horn's parallel analysis compares PCA run on your data with PCA run on random data, and chooses the number of PCs that contribute more variance than random variation. The elbow plot is another method to assess the amount of variation explained by each principal component: the number of PCs at which the graph reaches a point of inflexion is the 'optimum'. Here the two methods give different results, so testing downstream results with various numbers of PCs is often done.

############# DIM REDUCTION ###################
# Horn's parallel analysis: compares observed PCA variance against PCA on
# permuted data and keeps PCs explaining more variance than chance.
horns <- PCAtools::parallelPCA(GetAssayData(s, assay='SCT', slot='scale.data')[VariableFeatures(s),])
horns$n ## 23 
## [1] 23
# Elbow heuristic suggests ~10 PCs vs 23 from Horn's — downstream steps may
# need to be tested with both choices.
ElbowPlot(s) + ggtitle('Elbow plot') ## 10

#ggsave(paste0(root, '/new_visualisations/FINAL_WORKFLOW/Elbow_plot.png'))
# Carry sample/day metadata over from the filtered SCE; assumes the cell
# order in 's' matches filtered_sce_3 — TODO confirm.
s$sample <- filtered_sce_3$sample
s$day <- filtered_sce_3$day

PCA plot using data stored in seurat object

########### PCA #############
# Scatter the cells in PC1/PC2 space, coloured by timepoint, with dashed
# reference lines through the origin.
pca_df <- as.data.frame(Embeddings(s[['pca']]))
pca_df$sample <- s$sample
p1 <- ggplot(pca_df, aes(PC_1, PC_2, color=sample)) + 
  geom_point(size=2, alpha=0.8) +
  coord_fixed(1) +
  geom_vline(xintercept=0, linetype='dashed', colour='#777777', size=0.2) +
  geom_hline(yintercept=0, linetype='dashed', colour='#777777', size=0.2) +
  scale_colour_manual(
    values=cc[1:7],
    guide = guide_legend(override.aes = list(size = 5), title='Timepoint')
  )
#ggsave(paste0(root, '/new_visualisations/FINAL_WORKFLOW/PCA_plot.png'))

To assess the day-of-sampling effect, i.e. to see whether any separation is caused by day of sampling (which isn't important biological heterogeneity), we ran PCA on a subset of the data and assessed whether there was separation due to day of sampling. Here PCA doesn't seem to separate the groups by day. It is important to note that these are all the same cell types, so there isn't much heterogeneity. Mixing between days of sampling implies that any batch effect due to sampling day is minimal.

## Run PCA on subset ##
# Re-run PCA on only the cells with a known day-of-sampling label, to check
# whether cells separate by sampling day (a technical batch effect).
s.s <- subset(s, subset=day!='Other')
s.s <- s.s %>% RunPCA() 
to.plot <- as.data.frame(Embeddings(s.s[['pca']])) %>% 
  mutate(day=s.s$day) 
p2 <- ggplot(to.plot) + 
  geom_point(aes(PC_1, PC_2, color=day), size=2)+
  coord_fixed(1) +
  geom_vline(xintercept=0, linetype='dashed', colour='#777777', size=0.2) +
  geom_hline(yintercept=0, linetype='dashed', colour='#777777', size=0.2) +
  # BUG FIX: points are coloured by day of sampling, so the legend title
  # should read 'Day', not 'Timepoint' (copy-paste from the plot above).
  scale_colour_manual(values=cc[1:7], 
  guide = guide_legend(override.aes = list(size = 5), title='Day'))
#ggsave(paste0(root, '/new_visualisations/FINAL_WORKFLOW/PCA_plot_batcheffect.png'))
cowplot::plot_grid(p1+ggtitle('PCA'),p2+ggtitle('PCA to assess day of \n sampling variation'))

Multi-dimensional scaling (MDS) is another linear dimensionality reduction technique like PCA. They are both very similar, but MDS preserves distances between points whereas PCA preserves covariance between points. [pca and mds]: https://www.quora.com/Whats-the-difference-between-MDS-and-PCA

We do the same here as we did with PCA. ‘scale.data’ slot of the ‘SCT’ assay contains the data used to run PCA and MDS.

########## MDS ###############
# Classical MDS on Euclidean distances between cells in scale.data space,
# stored as a DimReduc so Embeddings() works the same way as for PCA.
cell_dist <- dist(t(GetAssayData(s, slot = "scale.data")))
mds <- cmdscale(d = cell_dist, k = 2)
colnames(mds) <- paste0("MDS_", 1:2)
s[["mds"]] <- CreateDimReducObject(embeddings = mds, key = "MDS_", assay = DefaultAssay(s))
mds_df <- as.data.frame(Embeddings(s[['mds']]))
mds_df$sample <- s$sample
p1 <- ggplot(mds_df, aes(MDS_1, MDS_2, color=sample)) + 
  geom_point(size=2) +
  coord_fixed(1) +
  geom_vline(xintercept=0, linetype='dashed', colour='#777777', size=0.2) +
  geom_hline(yintercept=0, linetype='dashed', colour='#777777', size=0.2) +
  scale_colour_manual(
    values=cc[1:7],
    guide = guide_legend(override.aes = list(size = 5), title='Timepoint')
  )
#ggsave(paste0(root, '/new_visualisations/FINAL_WORKFLOW/mds_plot.png'))
## Run MDS on subset ##
# Repeat MDS on only the cells with a known day-of-sampling label, to check
# for separation driven by sampling day (technical batch effect).
d <- dist(t(GetAssayData(s.s, slot = "scale.data")))
mds <- cmdscale(d = d, k = 2)
colnames(mds) <- paste0("MDS_", 1:2)
s.s[["mds"]] <- CreateDimReducObject(embeddings = mds, key = "MDS_", assay = DefaultAssay(s.s))
to.plot <- as.data.frame(Embeddings(s.s[['mds']])) %>% 
  mutate(day=s.s$day) 
p2 <- ggplot(to.plot) + 
  geom_point(aes(MDS_1, MDS_2, color=day), size=2)+
  coord_fixed(1) +
  geom_vline(xintercept=0, linetype='dashed', colour='#777777', size=0.2) +
  geom_hline(yintercept=0, linetype='dashed', colour='#777777', size=0.2) +
  # BUG FIX: points are coloured by day of sampling, so the legend title
  # should read 'Day', not 'Timepoint' (copy-paste from the plot above).
  scale_colour_manual(values=cc[1:7], 
  guide = guide_legend(override.aes = list(size = 5), title='Day'))
#ggsave(paste0(root, '/new_visualisations/FINAL_WORKFLOW/mds_plot_batcheffect.png'))
cowplot::plot_grid(p1+ggtitle('MDS'),p2+ggtitle('MDS to assess day of \n sampling variation'))

### 3D plots ###
# BUG FIX: the stored 'mds' embedding was computed with k = 2, so MDS_3 did
# not exist and plot_ly silently received z = NULL. Compute a 3-dimensional
# MDS here so the third axis is real.
mds3 <- cmdscale(dist(t(GetAssayData(s, slot = "scale.data"))), k = 3)
colnames(mds3) <- paste0("MDS_", 1:3)
df <- as.data.frame(mds3) %>% mutate(sample=s$sample)
plot_ly(data = df, x=df$MDS_1, y=df$MDS_2, z=df$MDS_3, type="scatter3d", mode="markers", color=df$sample, size = 2)